Overview

Confidence intervals around individual’s scores can be estimated using the Standard Error of Measurement. Dudek (1979) defined this as \(SEM = SD\sqrt{1 - \rho}\), where \(SD\) refers to the standard deviation of scores on the measure and \(\rho\) refers to the reliability of the measure. Intervals can then be formed as usual using \(CI = estimate ± SEM*1.96\).

# dependencies
library(tidyverse)
library(knitr)
library(kableExtra)
library(boot)
library(parallel)
library(bayestestR)
library(patchwork)
library(mdthemes)
library(lme4)
library(sjPlot)
library(emmeans)
library(ggstance)
library(janitor)
# library(merTools) called via merTools:: to avoid namespace collisions between MASS and dplyr


# function to round all numeric vars in a data frame
round_df <- function(df, n_digits = 3) {
  df %>% mutate_if(is.numeric, janitor::round_half_up, digits = n_digits)
}

# standard error of measurement
SEM <- function(SD, reliability){
  SD * sqrt(1 - reliability)
} 

# table format in output
options(knitr.table.format = "html") 

Data

# data with bootstrapped confidence intervals
data_estimates_D <- read_csv("../data/processed/data_estimates_D.csv") %>%
  filter(method == "bca")

# data for SEM confidence intervals
data_reliability_internal <- 
  read_csv("../data/processed/data_D_scores_internal_consistency_permuted_estimates_trial_types.csv") |>
  select(domain, trial_type, reliability = alpha)

data_scored <- 
  read_csv("../data/processed/data_estimates_D.csv") |>
  filter(method == "bca") |>
  select(unique_id, 
         domain, 
         trial_type, 
         estimate)

Calculate SEM and Confidence Intervals

data_sd <- data_scored  |>
  group_by(domain, trial_type) |>
  summarize(SD = sd(estimate))

data_sem <- 
  full_join(data_reliability_internal, data_sd, by = c("domain", "trial_type")) |>
  mutate(SEM = SEM(SD, reliability))

data_estimates_D_sem <- data_scored |>
  left_join(data_sem, by = c("domain", "trial_type")) |>
  dplyr::mutate(ci_lower = estimate - (SEM*1.96),
                ci_upper = estimate + (SEM*1.96),
                ci_width = ci_upper - ci_lower,
                sig = ifelse((ci_lower > 0 & ci_upper > 0) |
                               (ci_lower < 0 & ci_upper < 0), TRUE, FALSE)) |>
  round_df(2)

CI widths

MAP CI width

Because widths are consistent within trial type and domain, only MAP widths are needed, not MAP-MAP widths as with bootstrapping.

Most probable estimate.

data_ci_widths <- data_estimates_D_sem %>%
  select(domain, trial_type, ci_width) %>%
  distinct(domain, trial_type, .keep_all = TRUE) %>%
  mutate(domain = fct_rev(domain))

data_ci_widths %>%
  do(point_estimate(.$ci_width, centrality = "MAP")) %>%
  round_df(2) %>%
  kable() %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
MAP
1.23

CI width

By domain and trial type

data_ci_widths %>%
  pivot_wider(names_from = trial_type, values_from = ci_width) %>%
  round_df(2) %>%
  kable() %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
domain tt1 tt2 tt3 tt4
Stigma - ADHD 1.10 1.10 1.15 1.09
Personality - Agreeableness 1.13 1.17 1.25 1.29
Body image 1.31 1.32 1.21 1.25
Friend-Enemy 1.12 1.18 1.20 1.18
Gender (1) 1.30 1.24 1.27 1.22
Lincoln-Hitler 1.11 1.11 1.11 1.13
Race (1) 1.22 1.25 1.21 1.28
Religion 1.09 1.28 1.21 1.28
Clinton-Trump 1.20 1.31 1.25 1.25
Valenced words 1.12 1.30 1.22 1.23
Personality - Conscientiousness 1.17 1.20 1.29 1.22
Countries (1) 1.19 1.30 1.28 1.26
Countries (2) 1.29 1.23 1.22 1.33
Disgust (2) 1.28 1.26 1.24 1.16
Disgust (1) 1.08 1.05 1.07 0.98
Personality - Extraversion 1.16 1.24 1.19 1.23
Death (1) 1.11 1.23 1.11 1.26
Death (2) 1.20 1.21 1.25 1.28
Death (3) 1.11 1.22 1.27 1.24
Race (2) 0.87 0.90 0.91 0.90
Gender (2) 1.21 1.22 1.22 1.28
Personality - Neuroticism 1.25 1.19 1.23 1.31
Personality - Openness 1.24 1.28 1.21 1.21
Stigma - PTSD 1.00 1.07 1.14 1.13
Rich-Poor 1.25 1.29 1.32 1.26
Stigma - Schizophrenia 1.11 1.06 1.05 1.09
Sexuality (2) 0.83 0.90 0.89 0.95
Sexuality (1) 0.92 1.00 0.95 0.98
Shapes & colors (1) 1.10 1.27 1.32 1.31
Shapes & colors (2) 1.14 1.35 1.09 1.06
Shapes & colors (3) 1.17 1.25 1.20 1.20
Shapes & colors (4) 1.18 1.24 1.14 1.22
Shapes & colors (5) 1.14 1.30 1.25 1.19
Shapes & colors (6) 1.09 1.27 1.34 1.19
Shapes & colors (7) 0.97 1.12 1.02 1.10

Plot by domain and trial type

p_ci_widths <- 
  ggplot(data_ci_widths, aes(ci_width, domain)) + 
  geom_point(position = position_dodge(width = 0.8)) +
  mdthemes::md_theme_linedraw() +
  facet_wrap(~ trial_type, ncol = 4, nrow = 1) +
  labs(x = "Highest probability (MAP) 95% CI width",
       y = "") + 
  theme(legend.position = "top")

p_ci_widths

Proportion different from zero

Caterpillar plot

By domain

p_cis_by_domain <- 
  data_estimates_D_sem %>%
  mutate(domain = str_replace(domain, "Personality - ", "Big5: "),
         domain = str_replace(domain, "Stigma - ", "Stigma: ")) %>%
  arrange(estimate) %>%
  group_by(domain) %>%
  mutate(ordered_id = row_number()/n()) %>%
  ungroup() %>%
  ggplot() +
  geom_linerange(aes(x = ordered_id, ymin = ci_lower, ymax = ci_upper, color = sig),
                 alpha = 1) +
  geom_point(aes(ordered_id, estimate), size = 0.5, shape = "square") +
  geom_hline(yintercept = 0, linetype = "dotted") +
  mdthemes::md_theme_linedraw() +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        legend.position = "top") +
  scale_color_viridis_d(end = 0.6, direction = -1) +
  xlab("Ranked participant") +
  ylab("*D* score") +
  labs(color = "95% CI excludes zero point") + 
  facet_wrap(~ domain, ncol = 5)

p_cis_by_domain

Calculate scores

data_diff_zero <- 
  bind_rows(mutate(data_estimates_D,     scoring_method = "*D* scores bootstrapped CIs"),
            mutate(data_estimates_D_sem, scoring_method = "*D* scores SEM CIs")) %>%
  mutate(domain = as.factor(domain),
         trial_type = case_when(trial_type == "tt1" ~ "Trial type 1",
                                trial_type == "tt2" ~ "Trial type 2",
                                trial_type == "tt3" ~ "Trial type 3",
                                trial_type == "tt4" ~ "Trial type 4"),
         trial_type = fct_relevel(trial_type, "Trial type 1", "Trial type 2", "Trial type 3", "Trial type 4")) %>%
  group_by(domain, trial_type, scoring_method) %>%
  summarize(proportion_diff_zero = mean(sig),
            variance = plotrix::std.error(sig)^2,
            .groups = "drop") %>%
  # model cannot be run on zero variance or 0 or 1 logit, so offset by a minuscule amount
  mutate(proportion_diff_zero_temp = case_when(proportion_diff_zero < 0.001 ~ 0.001, 
                                               proportion_diff_zero > 0.999 ~ 0.999,
                                               TRUE ~ proportion_diff_zero),
         proportion_diff_zero_logit = boot::logit(proportion_diff_zero_temp)) %>%
  select(-proportion_diff_zero_temp) %>%
  #filter(!(proportion_diff_zero == 0 & variance == 0)) %>%
  mutate(variance = ifelse(variance == 0, 0.0001, variance)) 

# # save to disk
#write_csv(data_diff_zero, "../data/results/data_diff_zero_irap_d_vs_pi.csv")
# data_diff_zero <- read_csv("../data/results/data_diff_zero_irap_d_vs_pi.csv")

Plot

p_diff_zero <- 
  data_diff_zero %>%
  mutate(domain = fct_rev(factor(domain))) %>%
  ggplot(aes(proportion_diff_zero, domain, color = scoring_method, shape = scoring_method)) +
  geom_linerangeh(aes(xmin = proportion_diff_zero - sqrt(variance)*1.96,
                      xmax = proportion_diff_zero + sqrt(variance)*1.96),
                  position = position_dodge(width = 0.75)) + 
  geom_point(position = position_dodge(width = 0.75)) +
  scale_shape_manual(labels = c("*D* scores bootstrapped CIs", "*D* scores SEM CIs"), values = c(15, 16)) +
  scale_color_viridis_d(begin = 0.3, end = 0.7, labels = c("*D* scores bootstrapped CIs", "*D* scores SEM CIs")) +
  mdthemes::md_theme_linedraw() +
  facet_wrap(~ trial_type, ncol = 4) +
  labs(x = "Proportion of scores<br/>different from zero point",
       y = "",
       color = "Scoring method",
       shape = "Scoring method") + 
  theme(legend.position = "top",
        panel.spacing = unit(1.5, "lines")) +
  coord_cartesian(xlim = c(0,1))

p_diff_zero

Model

# fit model
fit_diff_zero <- 
  lmer(proportion_diff_zero_logit ~ 1 + scoring_method + (1 | domain/trial_type),
       weights = 1/variance, 
       data = data_diff_zero,
       # solution from https://www.metafor-project.org/doku.php/tips:rma_vs_lm_lme_lmer
       control = lmerControl(check.nobs.vs.nlev = "ignore",  
                             check.nobs.vs.nRE = "ignore"))

# extract marginal means
results_emm_diff_zero <- 
  summary(emmeans(fit_diff_zero, ~ scoring_method)) %>%
  dplyr::select(scoring_method, estimate = emmean, se = SE, ci_lower = lower.CL, ci_upper = upper.CL)

# extract re Tau
results_re_tau_diff_zero <- fit_diff_zero %>%
  merTools::REsdExtract() %>%
  as_tibble(rownames = "trial_type") %>%
  rename(tau = value) 

# combine
results_diff_zero <- results_emm_diff_zero %>%
  # as in metafor package's implementation of prediction intervals, see metafor::predict.rma.R 
  mutate(pi_lower = estimate - (1.96 * sqrt(se^2 + results_re_tau_diff_zero$tau[2]^2)),   # [2] is variance for domain
         pi_upper = estimate + (1.96 * sqrt(se^2 + results_re_tau_diff_zero$tau[2]^2))) |>
  select(-se) |>
  mutate_if(is.numeric, boot::inv.logit)

# plot
p_prop_nonzero <- 
  ggplot(results_diff_zero, aes(scoring_method, estimate, 
                                #color = scoring_method, 
                                #shape = scoring_method, 
                                #group = scoring_method
  )) +
  geom_linerange(aes(ymin = pi_lower, ymax = pi_upper), size = 0.5, position = position_dodge(width = 0.8), linetype = "dotted") +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper), size = 1.25, position = position_dodge(width = 0.8)) +
  geom_point(position = position_dodge(width = 0.8), size = 2.5) +
  mdthemes::md_theme_linedraw() +
  scale_y_continuous(breaks = c(0, .25, .5, .75, 1), labels = c("0.00<br/>(Worse)", "0.25", "0.50", "0.75", "1.00<br/>(Better)")) +
  #scale_color_viridis_d(alpha = 1, begin = 0.3, end = 0.7, labels = c("*D* scores", "PI scores")) +
  #scale_shape_manual(labels = c("*D* scores", "PI scores"), values = c(15, 16)) +
  scale_x_discrete(labels = c("IRAP D scores bootstrapped CIs", "IRAP D scores SEM CIs")) +
  labs(x = "",
       y = "Proportion of scores<br/>different from zero point<br/>") + 
  theme(legend.position = "none") +
  coord_flip(ylim = c(0, 1))

p_prop_nonzero

results_diff_zero %>%
  round_df(2) %>%
  kable() %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
scoring_method estimate ci_lower ci_upper pi_lower pi_upper
D scores bootstrapped CIs 0.10 0.07 0.14 0.01 0.46
D scores SEM CIs 0.12 0.08 0.16 0.02 0.51
# tests
data_emms_diff_zero <- emmeans(fit_diff_zero, list(pairwise ~ scoring_method), adjust = "holm") 

summary(data_emms_diff_zero)$`pairwise differences of scoring_method` %>%
  as.data.frame() %>%
  select(comparison = 1, p.value) %>%
  mutate(p.value = ifelse(p.value < .001, "< .001", round_half_up(p.value, 3))) %>%
  kable() %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
comparison p.value
(D scores bootstrapped CIs) - (D scores SEM CIs) < .001

Proportion different from one another

Within domain and trial type.

Many have argued that the zero point is arbitrary and not a useful reference point. Instead of asking “what proportion of D/PI scores are different from zero?”, we could also ask “what proportion of D/PI scores are different from one another?”

A common way to assess whether for differences between two estimates is to assess for non overlap between their confidence intervals. However, it has been repeatedly pointed out that this is less than ideal: there are situations where confidence intervals overlap slightly and yet the difference in means is significant.

Cornell Statistical Consulting Unit (2008) Overlapping Confidence Intervals and Statistical Significance argue this clearly. From their whitepaper:

The null hypothesis of zero mean difference is rejected when

\(|x_1 - x_2| > t \times \sqrt{SE_1^2 + SE_2^2}\)

The individual confidence intervals do not overlap when

\(|x_1 - x_2| > t \times (SE_1 + SE_2)\)

It can be shown that the following is always true:

\(\sqrt{SE_1^2 + SE_2^2} \le (SE_1 + SE_2)\)

This means that as \(|x_1 - x_2|\) increases there will be a point at which there is a significant difference between the means, but where the confidence intervals still overlaps. I.e., non overlapping confidence intervals indicate differences, but partially overlapping intervals do not exclude that there being differences.

As such, it is more appropriate and liberal to test for differences between each score and every other score (within the same domain and trial type) based on the CI of the difference scores rather than the non-overlap of intervals of the pair of scores.

Calculate discriminability

# # discriminability using non-overlap of CIs
# discriminability <- function(data, i) {
#   data_with_indexes <- data[i,] # boot function requires data and index
#   ci_lower <- data_with_indexes$ci_lower 
#   ci_upper <- data_with_indexes$ci_upper
#   n_ci_lower <- length(ci_lower)
#   n_ci_upper <- length(ci_upper)
#   r_ci_lower <- sum(rank(c(ci_lower, ci_upper))[1:n_ci_lower])
#   A <- (r_ci_lower / n_ci_lower - (n_ci_lower + 1) / 2) / n_ci_upper
#   return(A)
# }

# discriminatory using the significance of the difference score
# the goal here is to assess mean_diff > 1.96 * sqrt(SE1^2 + SE2^2 for every possible comparison EXCLUDING self comparisons. This is tricky to do within a typical tidyverse workflow as it means doing mutates involving each row of a column and every other row of that column but not the same row.
  # the below solution is to use expand.grid to find all combinations of a row with itself, and then use the modulus of the length of the row to filter out the self-pairings. Then do mutates on the rows to assess significant differences. It's enough to then summarize the proportion of significant results across all participants.
discriminability <- function(data, i) {
  data_with_indexes <- data[i,] # boot function requires data and index
  
  grid_estimates <- expand.grid(data_with_indexes$estimate, data_with_indexes$estimate) |>
    mutate(diff = Var1 - Var2,
           row_number = row_number(),
           modulus = row_number %% (nrow(data_with_indexes)+1)) |>
    filter(modulus != 1) |>
    select(diff)
  
  grid_se <- expand.grid(data_with_indexes$se, data_with_indexes$se) |>
    mutate(critical_value = 1.96 * sqrt(Var1^2 + Var2^2),
           row_number = row_number(),
           modulus = row_number %% (nrow(data_with_indexes)+1)) |>
    filter(modulus != 1) |>
    select(critical_value)
  
  proportion_sig_diff <- 
    bind_cols(grid_estimates, grid_se) |>
    mutate(sig = abs(diff) > critical_value) |>
    summarize(proportion_sig_diff = mean(sig)) |>
    pull(proportion_sig_diff)
  
  return(proportion_sig_diff)
}


bootstrap_discriminability <- function(data){
  
  require(dplyr)
  require(boot)
  
  fit <- 
    boot::boot(data      = data, 
               statistic = discriminability, 
               R         = 2000,
               sim       = "ordinary", 
               stype     = "i",
               parallel  = "multicore", 
               ncpus     = parallel::detectCores())
  
  results <- boot::boot.ci(fit, conf = 0.95, type = "perc") # bca bootstraps throw an error, so use next best

  output <-
    tibble(
      estimate = fit$t0,
      ci_lower = results$percent[4],
      ci_upper = results$percent[5]
    )
  
  return(output)
}

D scores with bootstrapped CIs

data_discriminability_D <- read_csv("../data/results/data_discriminability_D.csv") %>%
    filter(method == "bca")

D scores with SEM CIs

# bootstrapping has a long execution time, so load saved values if they've already been calculated
if(file.exists("../data/results/data_discriminability_D_sem.csv")) {
  
  data_discriminability_D_sem <- read_csv("../data/results/data_discriminability_D_sem.csv") %>%
    filter(method == "SEM")
  
} else {
  
  # bootstrap D scores 
  data_discriminability_D_sem <- data_estimates_D_sem |>
    mutate(method = "SEM",
           se = (ci_upper - ci_lower)/(1.96*2)) |>
    select(unique_id, method, domain, trial_type, estimate, se) |>
    group_by(method, domain, trial_type) |>
    do(bootstrap_discriminability(data = .)) |>
    ungroup() |>
    #filter(method == "bca") |>
    rename(proportion_discriminable = estimate) |>
    mutate(variance = ((ci_upper - ci_lower)/(1.96*2))^2,
           domain = as.factor(domain),
           trial_type = fct_relevel(trial_type, "tt1", "tt2", "tt3", "tt4"),
           scoring_method = "*D* scores SEM CIs") 
  
  # save to disk
  write_csv(data_discriminability_D_sem, "../data/results/data_discriminability_D_sem.csv")
  
  data_discriminability_D_sem <- data_discriminability_D_sem %>%
    filter(method == "SEM")
  
}

Plot

# combine
data_discriminability_combined <- 
  bind_rows(
    mutate(data_discriminability_D, scoring_method = "IRAP D scores bootstrapped CIs"),
    mutate(data_discriminability_D_sem, scoring_method = "IRAP D scores SEM CIs")
  ) %>%
  mutate(trial_type = case_when(trial_type == "tt1" ~ "Trial type 1",
                                trial_type == "tt2" ~ "Trial type 2",   
                                trial_type == "tt3" ~ "Trial type 3",   
                                trial_type == "tt4" ~ "Trial type 4")) %>%
  #filter(!(proportion_discriminable == 0 & variance == 0)) %>%
  mutate(variance = ifelse(variance == 0, 0.0001, variance)) |>
  # model cannot be run on zero variance or 0 or 1 logit, so offset by a minuscule amount
  mutate(
    proportion_discriminable_temp = case_when(proportion_discriminable < 0.001 ~ 0.001, 
                                              proportion_discriminable > 0.999 ~ 0.999,
                                              TRUE ~ proportion_discriminable),
    proportion_discriminable_logit = boot::logit(proportion_discriminable_temp)
  ) %>%
  select(-proportion_discriminable_temp)

p_discriminability <- 
  data_discriminability_combined %>%
  mutate(domain = fct_rev(factor(domain))) %>%
  ggplot(aes(proportion_discriminable, domain, color = scoring_method, shape = scoring_method)) +
  geom_linerangeh(aes(xmin = proportion_discriminable - sqrt(variance)*1.96,
                      xmax = proportion_discriminable + sqrt(variance)*1.96),
                  position = position_dodge(width = 0.75)) + 
  geom_point(position = position_dodge(width = 0.75)) +
  scale_shape_manual(labels = c("IRAP D scores bootstrapped CIs", "IRAP D scores SEM CIs"), values = c(15, 16)) +
  scale_color_viridis_d(begin = 0.3, end = 0.7, labels = c("IRAP D scores bootstrapped CIs", "IRAP D scores SEM CIs")) +
  mdthemes::md_theme_linedraw() +
  facet_wrap(~ trial_type, ncol = 4) +
  labs(x = "Proportion of scores<br/>differerent from other scores",
       y = "",
       color = "Scoring method",
       shape = "Scoring method") + 
  theme(legend.position = "top",
        panel.spacing = unit(1.5, "lines")) +
  coord_cartesian(xlim = c(0,1))

p_discriminability

Model

# fit meta analytic model
fit_disciminability <- 
  lmer(proportion_discriminable_logit ~ 1 + scoring_method + (1 | domain/trial_type), 
       weights = 1/variance, 
       data = data_discriminability_combined,
       # solution from https://www.metafor-project.org/doku.php/tips:rma_vs_lm_lme_lmer
       control = lmerControl(check.nobs.vs.nlev = "ignore",  
                             check.nobs.vs.nRE = "ignore"))

# extract marginal means
results_emm_disciminability <-
  summary(emmeans(fit_disciminability, ~ scoring_method)) %>%
  dplyr::select(scoring_method, estimate = emmean, se = SE, ci_lower = lower.CL, ci_upper = upper.CL) 

# extract re Tau
results_re_tau_disciminability <- fit_disciminability %>%
  merTools::REsdExtract() %>%
  as_tibble(rownames = "trial_type") %>%
  rename(tau = value) 

# combine
results_disciminability <- results_emm_disciminability %>%
  mutate(pi_lower = estimate - (1.96 * sqrt(se^2 + results_re_tau_disciminability$tau[2]^2)),  # as in metafor package's implementation of credibility intervals, see metafor::predict.rma.R
         pi_upper = estimate + (1.96 * sqrt(se^2 + results_re_tau_disciminability$tau[2]^2))) |>
  select(-se) |>
  mutate_if(is.numeric, boot::inv.logit)

# plot
p_prop_discriminable <-
  ggplot(results_disciminability, aes(scoring_method, estimate, 
                                      #color = scoring_method, 
                                      #shape = scoring_method, 
                                      #group = scoring_method
  )) +
  geom_linerange(aes(ymin = pi_lower, ymax = pi_upper), size = 0.5, position = position_dodge(width = 0.8), linetype = "dotted") +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper), size = 1.25, position = position_dodge(width = 0.8)) +
  geom_point(position = position_dodge(width = 0.8), size = 2.5) +
  scale_y_continuous(breaks = c(0, .25, .5, .75, 1), labels = c("0.00<br/>(Worse)", "0.25", "0.50", "0.75", "1.00<br/>(Better)")) +
  #scale_shape_manual(labels = c("*D* scores", "PI scores"), values = c(15, 16)) +
  #scale_color_viridis_d(begin = 0.3, end = 0.7, labels = c("*D* scores", "PI scores")) +
  scale_x_discrete(labels = c("IRAP D scores bootstrapped CIs", "IRAP D scores SEM CIs")) +
  mdthemes::md_theme_linedraw() +
  labs(x = "",
       y = "Proportion of scores<br/>differerent from other scores<br/>") +
  theme(legend.position = "none") +
  coord_flip(ylim = c(0, 1))

p_prop_discriminable 

results_disciminability %>%
  round_df(2) %>%
  kable() %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
scoring_method estimate ci_lower ci_upper pi_lower pi_upper
IRAP D scores bootstrapped CIs 0.10 0.08 0.12 0.04 0.23
IRAP D scores SEM CIs 0.09 0.07 0.11 0.04 0.21
# tests
data_emms_disciminability <- emmeans(fit_disciminability, list(pairwise ~ scoring_method), adjust = "holm") 

summary(data_emms_disciminability)$`pairwise differences of scoring_method` %>%
  as.data.frame() %>%
  select(comparison = 1, p.value) %>%
  mutate(p.value = ifelse(p.value < .001, "< .001", round_half_up(p.value, 3))) %>%
  kable() %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
comparison p.value
IRAP D scores bootstrapped CIs - IRAP D scores SEM CIs < .001

CI widths as a proportion of observed range

NB observed range of confidence intervals

Calculate scores

# max range as example
data_estimates_D_sem %>%
  dplyr::summarize(min = min(ci_lower, na.rm = TRUE),
                   max = max(ci_upper, na.rm = TRUE),
                   .groups = "drop") %>%
  mutate(range = max - min) %>%
  kable() %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
min max range
-1.84 2.09 3.93
## calculate observed ranges 
observed_range_estimates_D <- data_estimates_D %>%
  group_by(domain, trial_type) %>%
  dplyr::summarize(min = min(ci_lower, na.rm = TRUE),
                   max = max(ci_upper, na.rm = TRUE),
                   .groups = "drop") %>%
  mutate(range = max - min) 

observed_range_estimates_D_sem <- data_estimates_D_sem %>%
  group_by(domain, trial_type) %>%
  dplyr::summarize(min = min(ci_lower, na.rm = TRUE),
                   max = max(ci_upper, na.rm = TRUE),
                   .groups = "drop") %>%
  mutate(range = max - min) 

# calculate CI / range 
data_ci_width_proportions_D <- data_estimates_D %>%
  # join this data into the original data
  full_join(observed_range_estimates_D, by = c("domain", "trial_type")) %>%
  # calculate ci width as a proportion of observed range
  mutate(ci_width_proportion = ci_width / range) %>%
  mutate(scoring_method = "*D* scores bootstrapped CIs") 

data_ci_width_proportions_D_sem <- data_estimates_D_sem %>%
  # join this data into the original data
  full_join(observed_range_estimates_D_sem, by = c("domain", "trial_type")) %>%
  # calculate ci width as a proportion of observed range
  mutate(ci_width_proportion = ci_width / range) %>%
  mutate(scoring_method = "*D* scores SEM CIs") 

# combine
data_ci_width_proportions_combined <- 
  bind_rows(data_ci_width_proportions_D,
            data_ci_width_proportions_D_sem) %>%
  mutate(domain = as.factor(domain),
         trial_type = fct_relevel(trial_type, "tt1", "tt2", "tt3", "tt4")) %>%
  group_by(scoring_method, domain, trial_type) %>%
  summarize(ci_width_proportion_mean = mean(ci_width_proportion),
            variance = plotrix::std.error(ci_width_proportion)^2) %>%
  ungroup() %>%
  mutate(variance = ifelse(variance == 0, 0.0001, variance)) %>%
  # logit transform
  mutate(ci_width_proportion_mean_temp = case_when(ci_width_proportion_mean < 0.0001 ~ 0.0001,
                                                   ci_width_proportion_mean > 0.9999 ~ 0.9999,
                                                   TRUE ~ ci_width_proportion_mean),
         ci_width_proportion_mean_logit = boot::logit(ci_width_proportion_mean_temp)) %>%
  select(-ci_width_proportion_mean_temp)

write_csv(data_ci_width_proportions_combined, "../data/results/data_ci_width_proportions_irap_d_boot_vs_sem.csv")
#data_ci_width_proportions_combined <- read_csv("../data/results/data_ci_width_proportions_irap_d_boot_vs_sem.csv")

Plot

p_coverage <- 
  data_ci_width_proportions_combined %>%
  mutate(domain = fct_rev(factor(domain))) %>%
  mutate(trial_type = case_when(trial_type == "tt1" ~ "Trial type 1",
                                trial_type == "tt2" ~ "Trial type 2",   
                                trial_type == "tt3" ~ "Trial type 3",   
                                trial_type == "tt4" ~ "Trial type 4")) %>%
  ggplot(aes(ci_width_proportion_mean, domain, color = scoring_method, shape = scoring_method)) +
  geom_point(position = position_dodge(width = 0.75)) +
  scale_shape_manual(labels = c("*D* scores boostrapped CIs", "*D* scores SEM CIs"), values = c(15, 16)) +
  geom_linerangeh(aes(xmin = ci_width_proportion_mean - sqrt(variance)*1.96,
                      xmax = ci_width_proportion_mean + sqrt(variance)*1.96),
                  position = position_dodge(width = 0.75)) + 
  scale_color_viridis_d(begin = 0.3, end = 0.7, labels = c("*D* scores boostrapped CIs", "*D* scores SEM CIs")) +
  mdthemes::md_theme_linedraw() +
  facet_wrap(~ trial_type, ncol = 4) +
  labs(x = "Proportion of observed range covered<br/>by individual scores' 95% CIs",
       y = "",
       color = "Scoring method",
       shape = "Scoring method") + 
  theme(legend.position = "top",
        panel.spacing = unit(1.5, "lines")) +
  coord_cartesian(xlim = c(0,1))

p_coverage

Model

# fit model
fit_ci_width_proportions <- 
  lmer(ci_width_proportion_mean_logit ~ 1 + scoring_method + (1 | domain/trial_type), 
       weights = 1/variance,
       data = data_ci_width_proportions_combined,
       # solution from https://www.metafor-project.org/doku.php/tips:rma_vs_lm_lme_lmer
       control = lmerControl(check.nobs.vs.nlev = "ignore",  
                             check.nobs.vs.nRE = "ignore"))

# extract marginal means
results_emm_ci_width_proportions <-
  summary(emmeans(fit_ci_width_proportions, ~ scoring_method)) %>%
  dplyr::select(scoring_method, estimate = emmean, se = SE, ci_lower = lower.CL, ci_upper = upper.CL)

# extract re Tau
results_re_tau_ci_width_proportions <-
  merTools::REsdExtract(fit_ci_width_proportions) %>%
  as_tibble(rownames = "trial_type") %>%
  rename(tau = value)

# combine
results_ci_width_proportions <- results_emm_ci_width_proportions %>%
  mutate(pi_lower = estimate - (1.96 * sqrt(se^2 + results_re_tau_ci_width_proportions$tau[2]^2)),  # as in metafor package's implementation of credibility intervals, see metafor::predict.rma.R
         pi_upper = estimate + (1.96 * sqrt(se^2 + results_re_tau_ci_width_proportions$tau[2]^2))) %>%
  select(-se) %>%
  mutate_if(is.numeric, boot::inv.logit)

# plot
p_ci_width_proportion_observed_range <-
  ggplot(results_ci_width_proportions, aes(scoring_method, estimate, 
                                           #color = scoring_method, 
                                           #shape = scoring_method, 
                                           #group = scoring_method
  )) +
  geom_linerange(aes(ymin = pi_lower, ymax = pi_upper), size = 0.5, position = position_dodge(width = 0.8), linetype = "dotted") +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper), size = 1.25, position = position_dodge(width = 0.8)) +
  geom_point(position = position_dodge(width = 0.8), size = 2.5) +
  scale_shape_discrete(labels = c("IRAP D scores bootstrapped CIs", "IRAP D scores SEM CIs")) +
  scale_y_continuous(breaks = c(0, .25, .5, .75, 1), labels = c("0.00<br/>(Better)", "0.25", "0.50", "0.75", "1.00<br/>(Worse)")) +
  #scale_shape_manual(labels = c("*D* scores", "PI scores"), values = c(15, 16)) +
  #scale_color_viridis_d(begin = 0.3, end = 0.7, labels = c("*D* scores", "PI scores")) +
  scale_x_discrete(labels = c("IRAP D scores bootstrapped CIs", "IRAP D scores SEM CIs")) +
  mdthemes::md_theme_linedraw() +
  labs(x = "",
       y = "Proportion of observed range covered<br/>by individual scores' 95% CIs") +
  theme(legend.position = "none") +
  coord_flip(ylim = c(0, 1))

p_ci_width_proportion_observed_range

results_ci_width_proportions %>%
  round_df(2) %>%
  kable() %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
scoring_method estimate ci_lower ci_upper pi_lower pi_upper
D scores bootstrapped CIs 0.51 0.50 0.53 0.43 0.60
D scores SEM CIs 0.45 0.43 0.46 0.36 0.53
# tests
data_emms_ci_width_proportions <- emmeans(fit_ci_width_proportions, list(pairwise ~ scoring_method), adjust = "holm") 

summary(data_emms_ci_width_proportions)$`pairwise differences of scoring_method` %>%
  as.data.frame() %>%
  select(comparison = 1, p.value) %>%
  mutate(p.value = ifelse(p.value < .001, "< .001", round_half_up(p.value, 3))) %>%
  kable() %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
comparison p.value
(D scores bootstrapped CIs) - (D scores SEM CIs) < .001

Combined plots

Supplementary figure 7S

p_cis_by_domain

ggsave(filename  = "plots/supplementary_figure_7S_cis_by_domain_irap_d_boot_vs_sem.pdf",
       plot      = p_cis_by_domain,
       device    = "pdf",
       # path      = NULL,
       # dpi       = 300,
       units     = "in",
       width     = 10,
       height    = 10,
       limitsize = TRUE)

Supplementary Figure 8S

p_combined <- 
  p_prop_nonzero + 
  p_prop_discriminable + 
  p_ci_width_proportion_observed_range +
  plot_layout(ncol = 1) #, guides = "collect") & theme(legend.position = "top")

p_combined

ggsave(filename  = "plots/supplementary_figure_8_metaanalyses_irap_d_boot_vs_sem.pdf",
       plot      = p_combined,
       device    = "pdf",
       # path      = NULL,
       # dpi       = 300,
       units     = "in",
       width     = 5,
       height    = 5,
       limitsize = TRUE)

Session info

sessionInfo()
## R version 4.2.1 (2022-06-23)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur ... 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] janitor_2.1.0     ggstance_0.3.5    emmeans_1.7.5     sjPlot_2.8.10    
##  [5] lme4_1.1-30       Matrix_1.4-1      mdthemes_0.1.0    patchwork_1.1.1  
##  [9] bayestestR_0.13.0 boot_1.3-28       kableExtra_1.3.4  knitr_1.39       
## [13] forcats_0.5.1     stringr_1.4.1     dplyr_1.0.9       purrr_0.3.4      
## [17] readr_2.1.2       tidyr_1.2.0       tibble_3.1.8      ggplot2_3.3.6    
## [21] tidyverse_1.3.2  
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.4.0        backports_1.4.1     blme_1.0-5         
##   [4] systemfonts_1.0.4   splines_4.2.1       listenv_0.8.0      
##   [7] TH.data_1.1-1       digest_0.6.29       foreach_1.5.2      
##  [10] htmltools_0.5.3     fansi_1.0.3         magrittr_2.0.3     
##  [13] googlesheets4_1.0.0 tzdb_0.3.0          globals_0.15.1     
##  [16] modelr_0.1.8        vroom_1.5.7         sandwich_3.0-2     
##  [19] svglite_2.1.0       colorspace_2.0-3    rvest_1.0.2        
##  [22] textshaping_0.3.6   haven_2.5.0         xfun_0.31          
##  [25] crayon_1.5.1        jsonlite_1.8.0      survival_3.3-1     
##  [28] zoo_1.8-10          iterators_1.0.14    glue_1.6.2         
##  [31] gtable_0.3.0        gargle_1.2.0        webshot_0.5.3      
##  [34] sjstats_0.18.1      sjmisc_2.8.9        abind_1.4-5        
##  [37] scales_1.2.0        mvtnorm_1.1-3       DBI_1.1.3          
##  [40] ggeffects_1.1.2     Rcpp_1.0.9          plotrix_3.8-2      
##  [43] viridisLite_0.4.0   xtable_1.8-4        performance_0.9.1  
##  [46] merTools_0.5.2      gridtext_0.1.4      bit_4.0.4          
##  [49] datawizard_0.6.1    httr_1.4.3          ellipsis_0.3.2     
##  [52] pkgconfig_2.0.3     farver_2.1.1        sass_0.4.2         
##  [55] dbplyr_2.2.1        utf8_1.2.2          tidyselect_1.1.2   
##  [58] labeling_0.4.2      rlang_1.0.4         later_1.3.0        
##  [61] effectsize_0.7.0.5  munsell_0.5.0       cellranger_1.1.0   
##  [64] tools_4.2.1         cachem_1.0.6        cli_3.3.0          
##  [67] generics_0.1.3      sjlabelled_1.2.0    broom_1.0.0        
##  [70] evaluate_0.15       fastmap_1.1.0       arm_1.12-2         
##  [73] yaml_2.3.5          ragg_1.2.2          bit64_4.0.5        
##  [76] fs_1.5.2            future_1.27.0       nlme_3.1-157       
##  [79] mime_0.12           xml2_1.3.3          compiler_4.2.1     
##  [82] pbkrtest_0.5.1      rstudioapi_0.13     reprex_2.0.1       
##  [85] bslib_0.4.0         stringi_1.7.8       highr_0.9          
##  [88] parameters_0.18.2   lattice_0.20-45     nloptr_2.0.3       
##  [91] markdown_1.1        vctrs_0.4.1         pillar_1.8.0       
##  [94] lifecycle_1.0.1     furrr_0.3.0         jquerylib_0.1.4    
##  [97] estimability_1.4    insight_0.18.4      httpuv_1.6.5       
## [100] R6_2.5.1            promises_1.2.0.1    parallelly_1.32.1  
## [103] codetools_0.2-18    MASS_7.3-57         assertthat_0.2.1   
## [106] withr_2.5.0         multcomp_1.4-20     broom.mixed_0.2.9.4
## [109] hms_1.1.1           ggtext_0.1.1        grid_4.2.1         
## [112] coda_0.19-4         minqa_1.2.4         rmarkdown_2.14     
## [115] snakecase_0.11.0    googledrive_2.0.0   shiny_1.7.2        
## [118] lubridate_1.8.0